# -*- coding: utf-8 -*-
"""
Created on Sat Feb 28 16:38:44 2026

@author: Wieczorek_W_Station
"""

import os
from sentence_transformers import SentenceTransformer

import operator as op
import pandas as pd
import pickle
import numpy as np

from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired

#%%
# =============================================================================
# Load Data
# =============================================================================
root = "C:\\Users\\Wieczorek_W_Station\\Dropbox\\Arbeit Kassel\\paperideen\\Moltbook_Science\\Data\\"
path = os.path.join(root, "PreparedData")
molts = os.path.join(root, "Molts")
outputs = os.path.join(root, "Outputs")
figures = os.path.join(root, "Figures")

## make sure the result folders exist; exist_ok only tolerates "already
## there", so genuine problems (permissions, wrong root) are no longer
## silently swallowed
for p in [outputs, figures]:
    os.makedirs(p, exist_ok=True)


## load embeddings
## the working directory is switched to the prepared-data folder, the
## relative pickle names below are resolved against it
## NOTE: pickle.load can execute code stored in the file; only load pickles
## that were produced by this project
os.chdir(path)
with open("ThreadEmbeddingsDict.pickle", "rb") as handle:
    threadsEmbeddings = pickle.load(handle)
with open("commentEmbeddingsDict.pickle", "rb") as handle:
    commentEmbeddings = pickle.load(handle)

## both objects are concatenated with "+" (presumably lists of records;
## verify against the script that writes the pickles)
combinedEmbeddings = threadsEmbeddings + commentEmbeddings

## drop the two partial objects, only the combined one is used from here on
del threadsEmbeddings, commentEmbeddings

#%%

## flatten the "split_content" chunks of every record into one flat list
documents = [chunk
             for record in combinedEmbeddings
             for chunk in record["split_content"]]
embedding_model = SentenceTransformer("allenai/scibert_scivocab_uncased")

# embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
embeddings = embedding_model.encode(
    documents,
    batch_size=64,
    show_progress_bar=True)

## store the embeddings on disk and read them back; the file is closed
## explicitly before it is reopened, so the write is guaranteed to be
## complete when the read starts (relative name -> current working directory)
with open("AllembeddingsSciBERT.pickle", "wb") as handle:
    pickle.dump(embeddings, handle)
with open("AllembeddingsSciBERT.pickle", "rb") as handle:
    embeddings = pickle.load(handle)
# print(embeddings.shape)
# pickle.dump(embeddings, open("Allembeddings.pickle","wb"))
# embeddings = pickle.load(open("Allembeddings.pickle","rb"))
## conduct BERTopic
## result containers (not filled in this part of the script)
topicsDF, coherences, cosSimilaritiesDfs = [], [], []


## topic representation: KeyBERT-inspired model, c-TF-IDF transformer and
## the n-gram vectorizer that is handed to update_topics after fitting
representationModel = KeyBERTInspired(nr_repr_docs=3)
ctfidf = ClassTfidfTransformer(reduce_frequent_words=True)
vectorizer = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 3),
    min_df=2,
    max_features=10000,
)

## initialize UMAP (10 output components, cosine metric, fixed seeds)
## (UMAP() without arguments would fall back to the library defaults)
umap = UMAP(
    n_components=10,
    n_neighbors=5,
    metric="cosine",
    min_dist=0.1,
    spread=5,
    local_connectivity=2,
    repulsion_strength=5,
    negative_sample_rate=1,
    n_epochs=500,
    densmap=True,
    random_state=42,
    transform_seed=42,
)

## initialize HDBSCAN
## (HDBSCAN() without arguments would fall back to the library defaults)
hdbscan = HDBSCAN(
    min_cluster_size=10,
    metric="manhattan",
    cluster_selection_method="eom",  # alternative: "leaf"
    approx_min_span_tree=False,
    core_dist_n_jobs=8,
)

## assemble the topic model; vectorizer_model is left at None here, the
## custom vectorizer above is only passed in via update_topics
bertopic = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap,
    hdbscan_model=hdbscan,
    vectorizer_model=None,
    ctfidf_model=ctfidf,
    representation_model=representationModel,
    verbose=True,
)


## fit model on data, using the pre-computed embeddings
bertopicOutput = bertopic.fit(documents=documents,
                              embeddings=embeddings)

## create better topic representations with the n-gram vectorizer
bertopic.update_topics(docs=documents,
                       vectorizer_model=vectorizer,
                       top_n_words=10)

## topic table before merging (overwritten below, handy for inspection)
topics = bertopic.get_topic_info()

## merge topic pairs that discuss the same issue: 7 with 15 and 3 with 4
## NOTE(review): the earlier comment named "topics 15 and 2", which does not
## match the pairs in the code; confirm the ids against the topic table
bertopic.merge_topics(documents, topics_to_merge=[[7, 15],
                                                  [3, 4]])
topics = bertopic.get_topic_info()


## 2-D coordinates for the document plot. visualize_documents expects 2-D
## reduced embeddings, so the 10-component clustering reducer is not reused;
## a separate reducer with the same settings but n_components=2 is fitted
## instead. This also leaves the reducer held by `bertopic` untouched.
projection2d = UMAP(densmap=True,
                    n_neighbors=5,
                    n_components=2,
                    metric="cosine",
                    n_epochs=500,
                    spread=5,
                    negative_sample_rate=1,
                    min_dist=0.1,
                    repulsion_strength=5,
                    local_connectivity=2,
                    random_state=42,
                    transform_seed=42)
reducedEmbeddings = projection2d.fit_transform(embeddings)

## write the interactive figure into the figures folder
os.chdir(figures)
viz = bertopic.visualize_documents(documents,
                                   reduced_embeddings=reducedEmbeddings)
viz.write_html("MoltbookTopics.html")

#%%
# =============================================================================
# keep only documents with topic == 0 and re-establish BERT space 
# =============================================================================

## topic id of every document from the first run
topicAssignments = bertopicOutput.topics_

## positions of all documents that were assigned to topic 0
index = [position
         for position, topic in enumerate(topicAssignments)
         if topic == 0]
if not index:
    raise ValueError("No document was assigned to topic 0; nothing to re-cluster.")

## row selection by position list (assumes embeddings is a NumPy array, as
## returned by the encoder; TODO confirm if the embedding source changes)
embeddingsSecondRun = embeddings[index]

## plain list lookup instead of operator.itemgetter: itemgetter returns a
## bare string (not a sequence of documents) when there is exactly one index
## and cannot be constructed at all for zero indices
documentsSecondRun = [documents[position] for position in index]


## initialize UMAP for the second run (10 output components, cosine metric,
## fixed seeds)
umap = UMAP(
    n_components=10,
    n_neighbors=15,
    metric="cosine",
    min_dist=0.0,
    spread=5,
    local_connectivity=1,
    repulsion_strength=5,
    negative_sample_rate=5,
    n_epochs=500,
    densmap=True,
    random_state=42,
    transform_seed=42,
)

## initialize HDBSCAN
## (HDBSCAN() without arguments would fall back to the library defaults)
hdbscan = HDBSCAN(
    min_cluster_size=10,
    metric="manhattan",
    cluster_selection_method="eom",  # alternative: "leaf"
    approx_min_span_tree=False,
    core_dist_n_jobs=8,
)

## second topic model; the c-TF-IDF transformer and the representation
## model are the same objects that were created for the first run
bertopicSecondRun = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap,
    hdbscan_model=hdbscan,
    vectorizer_model=None,
    ctfidf_model=ctfidf,
    representation_model=representationModel,
    verbose=True,
)

## fit the second model on the topic-0 subset, using its pre-computed
## embeddings
bertopicOutputSecondRun = bertopicSecondRun.fit(documents=documentsSecondRun,
                                                embeddings=embeddingsSecondRun)
## create better topic representations with the n-gram vectorizer
bertopicSecondRun.update_topics(docs=documentsSecondRun,
                                vectorizer_model=vectorizer,
                                top_n_words=10)

topicsSecondRun = bertopicOutputSecondRun.get_topic_info()

## 2-D coordinates for the document plot. visualize_documents expects 2-D
## reduced embeddings, so the 10-component clustering reducer is not reused;
## a separate reducer with the same settings but n_components=2 is fitted
## instead. This also leaves the reducer held by `bertopicSecondRun`
## untouched.
projection2d = UMAP(densmap=True,
                    n_neighbors=15,
                    n_components=2,
                    metric="cosine",
                    n_epochs=500,
                    spread=5,
                    negative_sample_rate=5,
                    min_dist=0.0,
                    repulsion_strength=5,
                    local_connectivity=1,
                    random_state=42,
                    transform_seed=42)
reducedEmbeddings = projection2d.fit_transform(embeddingsSecondRun)

## write the interactive figure into the figures folder
os.chdir(figures)
viz = bertopicSecondRun.visualize_documents(documentsSecondRun,
                                            reduced_embeddings=reducedEmbeddings)
viz.write_html("MoltbookTopicsSecondRun.html")

#%%
# =============================================================================
# Save Topic Lists with exemplary texts
# =============================================================================
## write both topic tables as semicolon-separated files into the outputs
## folder (relative names -> working directory is switched first)
## NOTE(review): the second file name says "Leaf" although the second run
## is configured with cluster_selection_method="eom"; confirm the name
os.chdir(outputs)

for topicTable, fileName in ((topics, "TopicsEomFirstRound.csv"),
                             (topicsSecondRun, "TopicsLeafSecondRound.csv")):
    topicTable.to_csv(fileName, sep=";")
